<!DOCTYPE html>
<html lang="zh-CN">
<head>
  <meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=2">
<meta name="theme-color" content="#222">
<meta name="generator" content="Hexo 4.2.1">
  <link rel="apple-touch-icon" sizes="180x180" href="/images/apple-touch-icon-next.png">
  <link rel="icon" type="image/png" sizes="32x32" href="/images/%E6%AD%A6%E6%B1%8932x32.png">
  <link rel="icon" type="image/png" sizes="16x16" href="/images/%E6%AD%A6%E6%B1%8916x16.png">
  <link rel="mask-icon" href="/images/logo.svg" color="#222">

<link rel="stylesheet" href="/css/main.css">


<link rel="stylesheet" href="/lib1/font-awesome/css/all.min.css">
  <link rel="stylesheet" href="/lib1/pace/pace-theme-center-simple.min.css">
  <script src="/lib1/pace/pace.min.js"></script>

<script id="hexo-configurations">
    // Theme namespace object: reuse window.NexT when it already exists, otherwise start from an empty object.
    var NexT = window.NexT || {};
    // Settings object serialized as one literal. Later inline scripts on this page
    // extend it (CONFIG.page) and read from it (CONFIG.comments).
    var CONFIG = {"hostname":"example.com","root":"/","scheme":"Gemini","version":"7.8.0","exturl":false,"sidebar":{"position":"left","display":"post","padding":18,"offset":12,"onmobile":false},"copycode":{"enable":true,"show_result":true,"style":null},"back2top":{"enable":true,"sidebar":false,"scrollpercent":false},"bookmark":{"enable":false,"color":"#222","save":"auto"},"fancybox":false,"mediumzoom":false,"lazyload":false,"pangu":false,"comments":{"style":"tabs","active":null,"storage":true,"lazyload":false,"nav":null},"algolia":{"hits":{"per_page":10},"labels":{"input_placeholder":"Search for Posts","hits_empty":"We didn't find any results for the search: ${query}","hits_stats":"${hits} results found in ${time} ms"}},"localsearch":{"enable":true,"trigger":"auto","top_n_per_article":1,"unescape":false,"preload":false},"motion":{"enable":true,"async":false,"transition":{"post_block":"fadeIn","post_header":"slideDownIn","post_body":"slideDownIn","coll_header":"slideLeftIn","sidebar":"slideUpIn"}},"path":"search.xml"};
  </script>

  <meta name="description" content="1、语言模型与多层感知机和能有效处理空间信息的卷积神经网络不同，循环神经网络是为了更好地处理时序信息而设计的。引入状态变量来存储过去的信息，并用其与当前的输入共同决定当前的输出。 语言模型：可将自然语言文本看作一段离散的时间序列，假设一段长度为T的文本中的词依次为W1、W2….Wt，那么在离散的时间序列中，Wt可看作在时间步t的输出或标签。给定一个长度为T的词的序列，语言模型将计算该序列概率：P(">
<meta property="og:type" content="article">
<meta property="og:title" content="循环神经网络">
<meta property="og:url" content="http://example.com/2020/08/21/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0/%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C/index.html">
<meta property="og:site_name" content="Technological Blog">
<meta property="og:description" content="1、语言模型与多层感知机和能有效处理空间信息的卷积神经网络不同，循环神经网络是为了更好地处理时序信息而设计的。引入状态变量来存储过去的信息，并用其与当前的输入共同决定当前的输出。 语言模型：可将自然语言文本看作一段离散的时间序列，假设一段长度为T的文本中的词依次为W1、W2….Wt，那么在离散的时间序列中，Wt可看作在时间步t的输出或标签。给定一个长度为T的词的序列，语言模型将计算该序列概率：P(">
<meta property="og:locale" content="zh_CN">
<meta property="og:image" content="http://example.com/image/%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C/%E9%9A%90%E8%97%8F%E7%8A%B6%E6%80%81%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C.png">
<meta property="og:image" content="http://example.com/image/%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C/%E5%AD%97%E7%AC%A6%E7%BA%A7%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C.png">
<meta property="og:image" content="http://example.com/image/%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C/%E5%8F%8D%E5%90%91%E4%BC%A0%E6%92%AD.png">
<meta property="og:image" content="http://example.com/image/%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C/%E9%87%8D%E7%BD%AE%E9%97%A8%E4%B8%8E%E6%9B%B4%E6%96%B0%E9%97%A8.png">
<meta property="og:image" content="http://example.com/image/%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C/%E5%80%99%E9%80%89%E9%9A%90%E8%97%8F%E7%8A%B6%E6%80%81.png">
<meta property="og:image" content="http://example.com/image/%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C/%E9%9A%90%E8%97%8F%E7%8A%B6%E6%80%81.png">
<meta property="article:published_time" content="2020-08-21T14:33:17.000Z">
<meta property="article:modified_time" content="2021-11-03T08:23:42.984Z">
<meta property="article:author" content="Li Yudong">
<meta property="article:tag" content="机器学习初学">
<meta name="twitter:card" content="summary">
<meta name="twitter:image" content="http://example.com/image/%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C/%E9%9A%90%E8%97%8F%E7%8A%B6%E6%80%81%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C.png">

<link rel="canonical" href="http://example.com/2020/08/21/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0/%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C/">


<script id="page-configurations">
  // Page-scoped flags attached to the shared CONFIG object.
  // Reference: https://hexo.io/docs/variables.html
  CONFIG.page = {
    "sidebar": "",
    "isHome": false,
    "isPost": true,
    "lang": "zh-CN"
  };
</script>

  <title>循环神经网络 | Technological Blog</title>
  






  <noscript>
  <style>
  .use-motion .brand,
  .use-motion .menu-item,
  .sidebar-inner,
  .use-motion .post-block,
  .use-motion .pagination,
  .use-motion .comments,
  .use-motion .post-header,
  .use-motion .post-body,
  .use-motion .collection-header { opacity: initial; }

  .use-motion .site-title,
  .use-motion .site-subtitle {
    opacity: initial;
    top: initial;
  }

  .use-motion .logo-line-before i { left: initial; }
  .use-motion .logo-line-after i { right: initial; }
  </style>
</noscript>

</head>

<body itemscope itemtype="http://schema.org/WebPage">
  <div class="container use-motion">
    <div class="headband"></div>

    <header class="header" itemscope itemtype="http://schema.org/WPHeader">
      <div class="header-inner"><div class="site-brand-container">
  <div class="site-nav-toggle">
    <div class="toggle" aria-label="切换导航栏">
      <span class="toggle-line toggle-line-first"></span>
      <span class="toggle-line toggle-line-middle"></span>
      <span class="toggle-line toggle-line-last"></span>
    </div>
  </div>

  <div class="site-meta">

    <a href="/" class="brand" rel="start">
      <span class="logo-line-before"><i></i></span>
      <h1 class="site-title">Technological Blog</h1>
      <span class="logo-line-after"><i></i></span>
    </a>
      <p class="site-subtitle" itemprop="description">IT小白的成长之旅</p>
  </div>

  <div class="site-nav-right">
    <div class="toggle popup-trigger">
        <i class="fa fa-search fa-fw fa-lg"></i>
    </div>
  </div>
</div>




<!-- Primary site navigation. Each entry pairs a Font Awesome icon with its label;
     the icon needs the "fa" style prefix plus an "fa-<name>" class to render. -->
<nav class="site-nav">
  <ul id="menu" class="main-menu menu">
    <li class="menu-item menu-item-about">
      <a href="/about/" rel="section"><i class="fa fa-user fa-fw"></i>关于</a>
    </li>
    <li class="menu-item menu-item-tags">
      <a href="/tags/" rel="section"><i class="fa fa-tags fa-fw"></i>标签<span class="badge">18</span></a>
    </li>
    <li class="menu-item menu-item-categories">
      <a href="/categories/" rel="section"><i class="fa fa-th fa-fw"></i>分类<span class="badge">14</span></a>
    </li>
    <li class="menu-item menu-item-archives">
      <a href="/archives/" rel="section"><i class="fa fa-archive fa-fw"></i>归档<span class="badge">95</span></a>
    </li>
    <li class="menu-item menu-item-search">
      <a role="button" class="popup-trigger"><i class="fa fa-search fa-fw"></i>搜索
      </a>
    </li>
  </ul>
</nav>



  <div class="search-pop-overlay">
    <div class="popup search-popup">
        <div class="search-header">
  <span class="search-icon">
    <i class="fa fa-search"></i>
  </span>
  <!-- Search field. There is no visible label element in the popup, so the
       accessible name is supplied with aria-label; the placeholder alone is not a label. -->
  <div class="search-input-container">
    <input autocomplete="off" autocapitalize="off"
           placeholder="搜索..." spellcheck="false"
           type="search" class="search-input"
           aria-label="搜索">
  </div>
  <span class="popup-btn-close">
    <i class="fa fa-times-circle"></i>
  </span>
</div>
<div id="search-result">
  <div id="no-result">
    <i class="fa fa-spinner fa-pulse fa-5x fa-fw"></i>
  </div>
</div>

    </div>
  </div>

</div>
    </header>

    
  <div class="back-to-top">
    <i class="fa fa-arrow-up"></i>
    <span>0%</span>
  </div>

  <a href="https://github.com/tiarmor1" class="github-corner" title="Follow me on GitHub" aria-label="Follow me on GitHub" rel="noopener" target="_blank"><svg width="80" height="80" viewBox="0 0 250 250" aria-hidden="true"><path d="M0,0 L115,115 L130,115 L142,142 L250,250 L250,0 Z"></path><path d="M128.3,109.0 C113.8,99.7 119.0,89.6 119.0,89.6 C122.0,82.7 120.5,78.6 120.5,78.6 C119.2,72.0 123.4,76.3 123.4,76.3 C127.3,80.9 125.5,87.3 125.5,87.3 C122.9,97.6 130.6,101.9 134.4,103.2" fill="currentColor" style="transform-origin: 130px 106px;" class="octo-arm"></path><path d="M115.0,115.0 C114.9,115.1 118.7,116.5 119.8,115.4 L133.7,101.6 C136.9,99.2 139.9,98.4 142.2,98.6 C133.8,88.0 127.5,74.4 143.8,58.0 C148.5,53.4 154.0,51.2 159.7,51.0 C160.3,49.4 163.2,43.6 171.4,40.1 C171.4,40.1 176.1,42.5 178.8,56.2 C183.1,58.6 187.2,61.8 190.9,65.4 C194.5,69.0 197.7,73.2 200.1,77.6 C213.8,80.2 216.3,84.9 216.3,84.9 C212.7,93.1 206.9,96.0 205.4,96.6 C205.1,102.4 203.0,107.8 198.3,112.5 C181.9,128.9 168.3,122.5 157.7,114.1 C157.9,116.9 156.7,120.9 152.7,124.9 L141.0,136.5 C139.8,137.7 141.6,141.9 141.8,141.8 Z" fill="currentColor" class="octo-body"></path></svg></a>


    <main class="main">
      <div class="main-inner">
        <div class="content-wrap">
          

          <div class="content post posts-expand">
            

    
  
  

  <article itemscope itemtype="http://schema.org/Article" class="post-block" lang="zh-CN">
    <link itemprop="mainEntityOfPage" href="http://example.com/2020/08/21/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0/%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="/images/author.jpg">
      <meta itemprop="name" content="Li Yudong">
      <meta itemprop="description" content="">
    </span>
    
    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="Technological Blog">
    </span>
      <header class="post-header">
        <h1 class="post-title" itemprop="name headline">
          循环神经网络
        </h1>
    
        <div class="post-meta">
            <span class="post-meta-item">
              <span class="post-meta-item-icon">
                <i class="far fa-calendar"></i>
              </span>
              <span class="post-meta-item-text">发表于</span>
    
              <time title="创建时间：2020-08-21 22:33:17" itemprop="dateCreated datePublished" datetime="2020-08-21T22:33:17+08:00">2020-08-21</time>
            </span>
              <span class="post-meta-item">
                <span class="post-meta-item-icon">
                  <i class="far fa-calendar-check"></i>
                </span>
                <span class="post-meta-item-text">更新于</span>
                <time title="修改时间：2021-11-03 16:23:42" itemprop="dateModified" datetime="2021-11-03T16:23:42+08:00">2021-11-03</time>
              </span>
            <span class="post-meta-item">
              <span class="post-meta-item-icon">
                <i class="far fa-folder"></i>
              </span>
              <span class="post-meta-item-text">分类于</span>
                <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
                  <a href="/categories/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0/" itemprop="url" rel="index"><span itemprop="name">机器学习</span></a>
                </span>
            </span>
    
          
    
        </div>
      </header>
    
    
    
    
    <div class="post-body" itemprop="articleBody">
    
      
        <h2 id="1、语言模型"><a href="#1、语言模型" class="headerlink" title="1、语言模型"></a>1、语言模型</h2><p>与多层感知机和能有效处理空间信息的卷积神经网络不同，循环神经网络是为了更好地处理时序信息而设计的。引入状态变量来存储过去的信息，并用其与当前的输入共同决定当前的输出。</p>
<p>语言模型：可将自然语言文本看作一段离散的时间序列，假设一段长度为T的文本中的词依次为W1、W2、…、WT，那么在离散的时间序列中，Wt（1≤t≤T）可看作在时间步t的输出或标签。给定一个长度为T的词的序列，语言模型将计算该序列概率：P(W1,W2,…,WT)。为计算该语言模型，需要先计算词的概率，以及一个词在给定前几个词的情况下的条件概率。</p>
<p>N元语法：计算和存储多个词的概率复杂度会呈指数级增加，故通过马尔可夫假设简化语言模型：一个词的出现只与前面N个词相关，即N阶马尔可夫链。</p>
<p>故基于n-1阶马尔可夫链，可将语言模型改写为：<br>$$<br>P(W1,W2,…,WT) = ∏(t=1,…,T) P（Wt|Wt-(n-1),…,Wt-1）<br>$$<br>以上也叫做n元语法，当n较小时，n元语法往往不准确，当n较大时，n元语法需要计算并存储大量的词频和多词相邻频率。</p>
<h2 id="2、循环神经网络"><a href="#2、循环神经网络" class="headerlink" title="2、循环神经网络"></a>2、循环神经网络</h2><p>循环神经网络：并非刚性地记忆所有固定长度的序列，而是通过隐藏状态来存储之前时间步的信息。利用之前的多层感知机，通过添加隐藏状态将其变成循环神经网络。</p>
<h3 id="不含隐藏状态的多层感知机："><a href="#不含隐藏状态的多层感知机：" class="headerlink" title="不含隐藏状态的多层感知机："></a>不含隐藏状态的多层感知机：</h3><p>样本数为n、输入个数为d的小批量数据样本X（形状为n×d）。设隐藏层的激活函数为f，则其输出H为：<br>$$<br>H = f（X Wxh + bh）<br>$$<br>其中隐藏层权重参数Wxh为Rd×h，隐藏层偏差参数bh为R1×h，h为隐藏单元个数。上式两者根据广播机制来相加，设输出层的输出个数为q，则输出层的输出为：<br>$$<br>O = HWhq +bq<br>$$<br>若为分类问题，可用softmax(O)来计算输出类别的概率分布。</p>
<h3 id="含隐藏状态的多层感知机"><a href="#含隐藏状态的多层感知机" class="headerlink" title="含隐藏状态的多层感知机"></a>含隐藏状态的多层感知机</h3><p>与上一个相区别的是，保存上一时间步的隐藏变量Ht-1，并引入一个新的权重参数Whh为Rhxh，该参数用于描述在当前时间步如何使用上一时间步的隐藏变量。<br>$$<br>Ht = f(Xt Wxh + Ht-1Whh +bh)<br>$$<br>与多层感知机相比，添加了隐藏变量来捕捉截至当前时间步的序列的历史信息，就像神经网络当前时间步的状态或记忆一样，因此也称为隐藏状态，由于在当前时间步使用了上一时间步的隐藏状态，因此计算是循环的。通过对上一次时间步的利用，其模型参数的数量不随时间步的增加而增长。将输入与前一时间步隐藏状态连结后，输入至一个激活函数为f的全连接层，该全连接层的输出就是当前时间步的隐藏状态，且模型参数为Wxh与Whh的连结，偏差为bh。</p>
<p><img src="/../../image/%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C/%E9%9A%90%E8%97%8F%E7%8A%B6%E6%80%81%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C.png" alt="隐藏状态循环神经网络"></p>
<h2 id="3、基于字符级神经网络的语言模型"><a href="#3、基于字符级神经网络的语言模型" class="headerlink" title="3、基于字符级神经网络的语言模型"></a>3、基于字符级神经网络的语言模型</h2><p><img src="/../../image/%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C/%E5%AD%97%E7%AC%A6%E7%BA%A7%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C.png" alt="字符级循环神经网络"></p>
<p>演示如何基于当前与过去字符来预测下一个字符，训练时对每个时间步的输出层输出使用softmax运算，然后使用交叉熵损失函数来计算其与标签的误差。</p>
<h3 id="处理数据集"><a href="#处理数据集" class="headerlink" title="处理数据集"></a>处理数据集</h3><p>建立字符索引：将每个字符映射成一个从0开始的连续整数，又称索引以方便后续处理。为得索引，我们将数据集中所有不同字符取出来，并逐一映射到索引来构造字典。</p>
<p>时序数据采样：每次随机读取小批量样本和标签，样本包含连续的字符。例：时间步数为5，样本序列为5个字符：“想，要，有，直，升”，则其标签序列为这些字符在训练集中的下一个字符：“要，有，直，升，机”。</p>
<p>随机采样：每个样本为原始序列上任意截取的一段序列，相邻的两个随机小批量不一定相邻，故每次随机采样前都需要重新初始化隐藏状态。</p>
<p>相邻采样：令相邻的两个随机小批量在原始序列上的位置也相毗邻，此时可以用一个小批量最终时间步的隐藏状态来初始化下一个小批量的隐藏状态：如此循环造成的影响：1、训练模型时，只需在每个迭代周期开始时初始化隐藏状态；2、当多个小批量通过传递隐藏状态串联起来时，梯度计算将依赖串联起的序列，迭代次数增加，梯度开销会越来越大。</p>
<h3 id="one-hot向量"><a href="#one-hot向量" class="headerlink" title="one-hot向量"></a>one-hot向量</h3><p>使用one-hot向量将词表示成向量输入到神经网络：每个字符已经同一个从0到N-1的连续整数值索引一一对应。如果一个字符的索引是i，则其向量为全0的长为N的向量，仅将位置为i的元素设为1。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br></pre></td><td class="code"><pre><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">to_one_hot</span><span class="params">(X,size)</span>:</span></span><br><span class="line">    <span class="keyword">return</span> [nd.one_hot(x,size) <span class="keyword">for</span> x <span class="keyword">in</span> X.T]</span><br></pre></td></tr></table></figure>



<h3 id="构建模型"><a href="#构建模型" class="headerlink" title="构建模型"></a>构建模型</h3><p>先初始化模型参数，将隐藏单元个数num_hiddens作为超参数，</p>
<!-- Listing: get_params() draws each weight matrix with nd.random.normal(scale=0.01), zero-initialises the biases, and attaches a gradient buffer to every parameter. The listing must stay on one physical line because it sits inside <pre>. -->
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br></pre></td><td class="code"><pre><span class="line">num_inputs, num_hiddens, num_outputs = vocab_size, <span class="number">256</span>, vocab_size</span><br><span class="line"></span><br><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">get_params</span><span class="params">()</span>:</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">_one</span><span class="params">(shape)</span>:</span></span><br><span class="line">        <span class="keyword">return</span> nd.random.normal(scale=<span class="number">0.01</span>, shape=shape, ctx=ctx)</span><br><span class="line">    <span class="comment">#先定义隐藏层参数</span></span><br><span class="line">    W_xh = _one((num_inputs, num_hiddens))</span><br><span class="line">    W_hh = _one((num_hiddens, num_hiddens))</span><br><span class="line">    b_h = nd.zeros(num_hiddens, ctx=ctx)</span><br><span class="line">    <span class="comment">#输出层参数</span></span><br><span class="line">    W_hq = _one((num_hiddens, num_outputs))</span><br><span class="line">    b_q = nd.zeros(num_outputs, ctx=ctx)</span><br><span class="line">    <span class="comment">#附上梯度</span></span><br><span class="line">    params = [W_xh, W_hh, b_h, W_hq, b_q]</span><br><span class="line">    <span class="keyword">for</span> param <span class="keyword">in</span> params:</span><br><span class="line">        param.attach_grad()</span><br><span class="line">    <span class="keyword">return</span> params</span><br></pre></td></tr></table></figure>

<p>定义init_rnn_state函数来返回初始化的隐藏状态，返回由一个形状为（批量大小，隐藏单元个数）的值为0的NDArray组成的元组。</p>
<!-- Listing: init_rnn_state() returns a one-element tuple holding a zero NDArray of shape (batch_size, num_hiddens). The trailing comma is what makes it a tuple, which rnn() unpacks with "H, = state". -->
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br></pre></td><td class="code"><pre><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">init_rnn_state</span><span class="params">(batch_size, num_hiddens, ctx)</span>:</span></span><br><span class="line">    <span class="keyword">return</span> (nd.zeros(shape=(batch_size, num_hiddens), ctx=ctx), )</span><br></pre></td></tr></table></figure>

<p>定义rnn函数来在一个时间步中计算隐藏状态与输出，激活函数使用tanh</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br></pre></td><td class="code"><pre><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">rnn</span><span class="params">(inputs, state, params)</span>:</span></span><br><span class="line">    <span class="comment">#inputs和outputs皆为num_steps个形状为（batch_size, vocab_size)的矩阵</span></span><br><span class="line">    W_xh, W_hh, b_h, W_hq, b_q = params</span><br><span class="line">    H, = state</span><br><span class="line">    outputs = []</span><br><span class="line">    <span class="keyword">for</span> X <span class="keyword">in</span> inputs:</span><br><span class="line">        H = nd.tanh(nd.dot(X, W_xh) + nd.dot(H, W_hh) + b_h)</span><br><span class="line">        Y = nd.dot(H, W_hq) + b_q</span><br><span class="line">        outputs.append(Y)</span><br><span class="line">    <span class="keyword">return</span> outputs, (H,)</span><br></pre></td></tr></table></figure>



<h3 id="定义预测函数"><a href="#定义预测函数" class="headerlink" title="定义预测函数"></a>定义预测函数</h3><p>基于前缀prefix（含有数个字符的字符串）来预测接下来的num_chars个字符:</p>
<!-- Listing: predict_rnn() feeds the prefix characters, then its own best prediction, through rnn() one step at a time. The final return sits outside the for loop so that all num_chars + len(prefix) - 1 steps run, and the one-hot helper is called by the name it is defined under on this page (to_one_hot). -->
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br></pre></td><td class="code"><pre><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">predict_rnn</span><span class="params">(prefix, num_chars, rnn, params, init_rnn_state, </span></span></span><br><span class="line"><span class="function"><span class="params">                num_hiddens, vocab_size, ctx, idx_to_char, char_to_idx)</span>:</span></span><br><span class="line">    state = init_rnn_state(<span class="number">1</span>, num_hiddens, ctx)</span><br><span class="line">    output = [char_to_idx[prefix[<span class="number">0</span>]]]</span><br><span class="line">    <span class="keyword">for</span> t <span class="keyword">in</span> range(num_chars + len(prefix) - <span class="number">1</span>):</span><br><span class="line">        <span class="comment">#将上一时间步的输出作为当前时间步的输入</span></span><br><span class="line">        X = to_one_hot(nd.array([output[<span class="number">-1</span>]], ctx=ctx), vocab_size)</span><br><span class="line">        <span class="comment">#计算输出和更新隐藏状态</span></span><br><span class="line">        (Y, state) = rnn(X, state, params)</span><br><span class="line">        <span class="comment">#下一个时间步的输入是prefix里的字符或者当前的最佳预测字符</span></span><br><span class="line">        <span class="keyword">if</span> t &lt; len(prefix) - <span class="number">1</span>:</span><br><span class="line">            output.append(char_to_idx[prefix[t + <span class="number">1</span>]])</span><br><span class="line">        <span class="keyword">else</span>:</span><br><span class="line">            output.append(int(Y[<span class="number">0</span>].argmax(axis=<span class="number">1</span>).asscalar()))</span><br><span class="line">    <span class="keyword">return</span> <span class="string">''</span>.join([idx_to_char[i] <span class="keyword">for</span> i <span class="keyword">in</span> output])</span><br></pre></td></tr></table></figure>



<h3 id="裁剪梯度"><a href="#裁剪梯度" class="headerlink" title="裁剪梯度"></a>裁剪梯度</h3><p>利用裁剪梯度以应对梯度爆炸，假设将所有模型参数梯度的元素拼接成一个向量g，并设裁剪的阈值为s;裁剪后的梯度的L2范数不超过s<br>$$<br>裁剪后的梯度为 min(s/||g|| , 1)g<br>$$</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br></pre></td><td class="code"><pre><span class="line"><span class="function"><span class="keyword">def</span> <span class="title">grad_clipping</span><span class="params">(params, theta, ctx)</span>:</span></span><br><span class="line">    norm = nd.array([<span class="number">0</span>], ctx)</span><br><span class="line">    <span class="keyword">for</span> param <span class="keyword">in</span> params:</span><br><span class="line">        norm += (param.grad **<span class="number">2</span>).sum()</span><br><span class="line">    norm = norm.sqrt().asscalar()</span><br><span class="line">    <span class="keyword">if</span> norm &gt; theta:</span><br><span class="line">        <span class="keyword">for</span> param <span class="keyword">in</span> params:</span><br><span class="line">            param.grad[:] *= theta / norm</span><br></pre></td></tr></table></figure>



<h3 id="模型训练函数"><a href="#模型训练函数" class="headerlink" title="模型训练函数"></a>模型训练函数</h3><p>困惑度用于评价语言模型的好坏，是对交叉熵损失函数做指数运算后得到的值。</p>
<p>与之前章节的训练函数相比，该模型训练有几点不同：1、用困惑度评价模型；2、在迭代模型参数前裁剪梯度；3、对时序数据采用不同的采样方法将导致隐藏状态初始化的不同。</p>
<h3 id="使用Gluon进行模型的简洁实现"><a href="#使用Gluon进行模型的简洁实现" class="headerlink" title="使用Gluon进行模型的简洁实现"></a>使用Gluon进行模型的简洁实现</h3><p>Gluon的rnn模块提供了循环神经网络的实现，</p>
<h3 id="时间反向传播"><a href="#时间反向传播" class="headerlink" title="时间反向传播"></a>时间反向传播</h3><p>不裁剪梯度时，模型将无法正常训练。我们将循环神经网络按时间步展开，从而得到模型变量与参数之间的依赖关系，并根据链式法则应用反向传播计算并存储梯度。</p>
<p><img src="/../../image/%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C/%E5%8F%8D%E5%90%91%E4%BC%A0%E6%92%AD.png" alt="反向传播"></p>
<p>每次迭代中，我们在依次计算完以上各个梯度后，会将它们存储起来，从而避免重复计算。同时，反向传播中的梯度计算可能会依赖变量的当前值，它们正是通过正向传播计算出来的。</p>
<h2 id="4、门控循环单元"><a href="#4、门控循环单元" class="headerlink" title="4、门控循环单元"></a>4、门控循环单元</h2><p>裁剪梯度可以应对梯度爆炸，但是没办法解决梯度衰减的问题。因此，循环神经网络在实际中很难捕捉时间序列中时间步距离较大的依赖关系。GRU门控循环神经网络通过可以学习的门来控制信息的流动。</p>
<h3 id="重置门和更新门"><a href="#重置门和更新门" class="headerlink" title="重置门和更新门"></a>重置门和更新门</h3><p>两者输入均为当前时间步输入Xt与上一时间步隐藏状态Ht-1，输出由激活函数为sigmoid函数的全连接层计算得到。</p>
<p><img src="/../../image/%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C/%E9%87%8D%E7%BD%AE%E9%97%A8%E4%B8%8E%E6%9B%B4%E6%96%B0%E9%97%A8.png" alt="重置门与更新门"></p>
<h3 id="候选隐藏状态"><a href="#候选隐藏状态" class="headerlink" title="候选隐藏状态"></a>候选隐藏状态</h3><p>GRU将计算候选隐藏状态来辅助稍后的隐藏状态计算，先将当前时间步重置门的输出与上一时间步的隐藏状态做按元素乘法。重置门中元素值接近0，则意味重置对应隐藏状态元素为0，即丢弃上一时间步的隐藏状态；若接近1，则保留上一时间步隐藏状态。</p>
<p>之后将按元素乘法的结果与当前时间步的输入连结，再通过含激活函数tanh的全连接层计算候选隐藏状态，其所有元素值域为[-1,1]。</p>
<p id="-1"><img src="/../../image/%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C/%E5%80%99%E9%80%89%E9%9A%90%E8%97%8F%E7%8A%B6%E6%80%81.png" alt="候选隐藏状态"></p><h3 id="隐藏状态"><a href="#隐藏状态" class="headerlink" title="隐藏状态"></a>隐藏状态</h3><p>最后，时间步t的隐藏状态Ht的计算使用了当前时间步的更新门Zt来对上一时间步的隐藏状态Ht-1和当前时间步的候选隐藏状态Ht*做组合：</p>
<p><img src="/../../image/%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C/%E9%9A%90%E8%97%8F%E7%8A%B6%E6%80%81.png" alt="隐藏状态"></p>
<p>在Gluon中可直接调用rnn模块中的GRU类来实现GRU门控循环。</p>
<h3 id="LSTM长短期记忆"><a href="#LSTM长短期记忆" class="headerlink" title="LSTM长短期记忆"></a>LSTM长短期记忆</h3><p>LSTM引入了三个门：输入门、遗忘门、输出门</p>
<p>在Gluon中可调用rnn模块中的LSTM类来实现长短期记忆。</p>
<h2 id="5、深度循环神经网络"><a href="#5、深度循环神经网络" class="headerlink" title="5、深度循环神经网络"></a>5、深度循环神经网络</h2><p>深度循环神经网络：目前为止介绍的循环神经网络只有一个单向的隐藏层，深度学习中通常会用到含多个隐藏层的循环神经网络，在该网络中，隐藏状态的信息不断传递至当前层的下一时间步和当前时间步的下一层。</p>
<p>双向循环神经网络：之前介绍的循环神经网络模型都是假设当前时间步由前面较早时间步的序列来决定，因此信息均通过隐藏状态从前往后传递。但有时当前时间步也可能由后面的时间步决定（例如写句子时，可能会根据句子后面的词来修改前面的用词），故双向循环神经网络通过增加从后往前传递信息的隐藏层来更灵活地处理这类信息。</p>

    </div>
    
    
    
    
    <div>
      
        <div>
    
        <div style="text-align:center;color: #ccc;font-size:14px;">-------------本文结束<i class="fa fa-paw"></i>感谢您的阅读-------------</div>
    
</div>

      
    </div>
        <div class="reward-container">
  <div>坚持原创技术分享，您的支持将鼓励我继续创作！</div>
  <button onclick="var qr = document.getElementById('qr'); qr.style.display = (qr.style.display === 'none') ? 'block' : 'none';">
    打赏
  </button>
  <div id="qr" style="display: none;">
      
      <div style="display: inline-block;">
        <img src="/images/alipay.png" alt="Li Yudong 支付宝">
        <p>支付宝</p>
      </div>

  </div>
</div>

        

<div>
<ul class="post-copyright">
  <li class="post-copyright-author">
    <strong>本文作者： </strong>Li Yudong
  </li>
  <li class="post-copyright-link">
    <strong>本文链接：</strong>
    <a href="http://example.com/2020/08/21/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0/%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C/" title="循环神经网络">http://example.com/2020/08/21/机器学习/循环神经网络/</a>
  </li>
  <li class="post-copyright-license">
    <strong>版权声明： </strong>本博客所有文章除特别声明外，均采用 <a href="https://creativecommons.org/licenses/by-nc-sa/4.0/" rel="noopener" target="_blank"><i class="fab fa-fw fa-creative-commons"></i>BY-NC-SA</a> 许可协议。转载请注明出处！
  </li>
</ul>
</div>

    
      <footer class="post-footer">
          <div class="post-tags">
              <a href="/tags/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E5%88%9D%E5%AD%A6/" rel="tag"># 机器学习初学</a>
          </div>
    
        

    
        
    <div class="post-nav">
      <div class="post-nav-item">
    <a href="/2020/08/20/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0/%E5%8D%B7%E7%A7%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C/" rel="prev" title="卷积神经网络">
      <i class="fa fa-chevron-left"></i> 卷积神经网络
    </a></div>
      <div class="post-nav-item">
    <a href="/2020/08/25/C++/C%E8%AF%AD%E8%A8%80%E7%A8%8B%E5%BA%8F%E8%AE%BE%E8%AE%A1/" rel="next" title="C语言程序设计">
      C语言程序设计 <i class="fa fa-chevron-right"></i>
    </a></div>
    </div>
      </footer>
    
  </article>
  
  
  



          </div>
          

<script>
  // When the comment tabs are registered, re-open the tab chosen by configuration
  // or, if storage is enabled, the one remembered in localStorage.
  window.addEventListener('tabs:register', () => {
    let activeClass = CONFIG.comments.activeClass;
    if (CONFIG.comments.storage) {
      const remembered = localStorage.getItem('comments_active');
      // A missing or empty stored value falls back to the configured class.
      if (remembered) activeClass = remembered;
    }
    if (!activeClass) return;
    const activeTab = document.querySelector(`a[href="#comment-${activeClass}"]`);
    if (activeTab) activeTab.click();
  });

  // With storage enabled, remember which comment pane was clicked; the value
  // saved is the second entry of the pane's class list.
  if (CONFIG.comments.storage) {
    window.addEventListener('tabs:click', event => {
      const pane = event.target;
      if (pane.matches('.tabs-comment .tab-content .tab-pane')) {
        localStorage.setItem('comments_active', pane.classList[1]);
      }
    });
  }
</script>

        </div>
          
  
  <div class="toggle sidebar-toggle">
    <span class="toggle-line toggle-line-first"></span>
    <span class="toggle-line toggle-line-middle"></span>
    <span class="toggle-line toggle-line-last"></span>
  </div>

  <aside class="sidebar">
    <div class="sidebar-inner">

      <ul class="sidebar-nav motion-element">
        <li class="sidebar-nav-toc">
          文章目录
        </li>
        <li class="sidebar-nav-overview">
          站点概览
        </li>
      </ul>

      <!--noindex-->
      <div class="post-toc-wrap sidebar-panel">
          <div class="post-toc motion-element"><ol class="nav"><li class="nav-item nav-level-2"><a class="nav-link" href="#1、语言模型"><span class="nav-number">1.</span> <span class="nav-text">1、语言模型</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#2、循环神经网络"><span class="nav-number">2.</span> <span class="nav-text">2、循环神经网络</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#不含隐藏状态的多层感知机："><span class="nav-number">2.1.</span> <span class="nav-text">不含隐藏状态的多层感知机：</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#含隐藏状态的多层感知机"><span class="nav-number">2.2.</span> <span class="nav-text">含隐藏状态的多层感知机</span></a></li></ol></li><li class="nav-item nav-level-2"><a class="nav-link" href="#3、基于字符级神经网络的语言模型"><span class="nav-number">3.</span> <span class="nav-text">3、基于字符级神经网络的语言模型</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#处理数据集"><span class="nav-number">3.1.</span> <span class="nav-text">处理数据集</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#one-hot向量"><span class="nav-number">3.2.</span> <span class="nav-text">one-hot向量</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#构建模型"><span class="nav-number">3.3.</span> <span class="nav-text">构建模型</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#定义预测函数"><span class="nav-number">3.4.</span> <span class="nav-text">定义预测函数</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#裁剪梯度"><span class="nav-number">3.5.</span> <span class="nav-text">裁剪梯度</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#模型训练函数"><span class="nav-number">3.6.</span> <span class="nav-text">模型训练函数</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#使用Gluon进行模型的简洁实现"><span class="nav-number">3.7.</span> <span class="nav-text">使用Gluon进行模型的简洁实现</span></a></li><li class="nav-item 
nav-level-3"><a class="nav-link" href="#时间反向传播"><span class="nav-number">3.8.</span> <span class="nav-text">时间反向传播</span></a></li></ol></li><li class="nav-item nav-level-2"><a class="nav-link" href="#4、门控循环单元"><span class="nav-number">4.</span> <span class="nav-text">4、门控循环单元</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#重置门和更新门"><span class="nav-number">4.1.</span> <span class="nav-text">重置门和更新门</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#候选隐藏状态"><span class="nav-number">4.2.</span> <span class="nav-text">候选隐藏状态</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#隐藏状态"><span class="nav-number">4.3.</span> <span class="nav-text">隐藏状态</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#LSTM长短期记忆"><span class="nav-number">4.4.</span> <span class="nav-text">LSTM长短期记忆</span></a></li></ol></li><li class="nav-item nav-level-2"><a class="nav-link" href="#5、深度循环神经网络"><span class="nav-number">5.</span> <span class="nav-text">5、深度循环神经网络</span></a></li></ol></div>
      </div>
      <!--/noindex-->

      <div class="site-overview-wrap sidebar-panel">
        <div class="site-author motion-element" itemprop="author" itemscope itemtype="http://schema.org/Person">
    <img class="site-author-image" itemprop="image" alt="Li Yudong"
      src="/images/author.jpg">
  <p class="site-author-name" itemprop="name">Li Yudong</p>
  <div class="site-description" itemprop="description"></div>
</div>
<div class="site-state-wrap motion-element">
  <nav class="site-state">
      <div class="site-state-item site-state-posts">
          <a href="/archives/">
        
          <span class="site-state-item-count">95</span>
          <span class="site-state-item-name">日志</span>
        </a>
      </div>
      <div class="site-state-item site-state-categories">
            <a href="/categories/">
          
        <span class="site-state-item-count">14</span>
        <span class="site-state-item-name">分类</span></a>
      </div>
      <div class="site-state-item site-state-tags">
            <a href="/tags/">
          
        <span class="site-state-item-count">18</span>
        <span class="site-state-item-name">标签</span></a>
      </div>
  </nav>
</div>
  <div class="links-of-author motion-element">
      <span class="links-of-author-item">
        <a href="https://github.com/tiarmor1" title="GitHub → https:&#x2F;&#x2F;github.com&#x2F;tiarmor1" rel="noopener" target="_blank"><i class="fab fa-github fa-fw"></i>GitHub</a>
      </span>
      <span class="links-of-author-item">
        <a href="mailto:1157019137@qq.com" title="E-Mail → mailto:1157019137@qq.com" rel="noopener" target="_blank"><i class="fa fa-envelope fa-fw"></i>E-Mail</a>
      </span>
  </div>
  <div class="cc-license motion-element" itemprop="license">
    <a href="https://creativecommons.org/licenses/by-nc-sa/4.0/" class="cc-opacity" rel="noopener" target="_blank"><img src="/images/cc-by-nc-sa.svg" alt="Creative Commons"></a>
  </div>



      </div>

    </div>
  </aside>
  <div id="sidebar-dimmer"></div>


      </div>
    </main>

    <footer class="footer">
      <div class="footer-inner">
        

        

<div class="copyright">
  
  &copy; 2020 – 
  <span itemprop="copyrightYear">2022</span>
  <span class="with-love">
    <i class="fa fa-heart"></i>
  </span>
  <span class="author" itemprop="copyrightHolder">Li Yudong</span>
</div>

        








      </div>
    </footer>
  </div>

  
  
  <script color='0,0,0' opacity='0.5' zIndex='-1' count='150' src="/lib1/canvas-nest/canvas-nest.min.js"></script>
  <script src="/lib1/anime.min.js"></script>
  <script src="/lib1/velocity/velocity.min.js"></script>
  <script src="/lib1/velocity/velocity.ui.min.js"></script>

<script src="/js/utils.js"></script>

<script src="/js/motion.js"></script>


<script src="/js/schemes/pisces.js"></script>


<script src="/js/next-boot.js"></script>




  




  
<script src="/js/local-search.js"></script>













  

  

  
   <!-- Fixed, click-through canvas overlay (pointer-events: none) followed by the two scripts loaded after it. -->
   <!-- NOTE(review): the anime.js copy comes from a third-party CDN without Subresource Integrity; consider self-hosting it or adding integrity/crossorigin. -->
   <canvas class="fireworks" style="position: fixed;left: 0;top: 0;z-index: 1; pointer-events: none;"></canvas>
   <script src="https://cdn.bootcss.com/animejs/2.2.0/anime.min.js"></script>
   <script src="/js/fireworks.js"></script>
  
</body>
</html>

